library(ggplot2)
library(dplyr)
library(ggdist)
library(caret)
library(tidyverse)
library(plotly)
library(networkD3)
library(scales)
library(reshape)
library(klaR)
library(countrycode)
library(ISLR)
library(rpart)
library(rpart.plot)

set.seed(999)

Part 1: Data Analysis

Exploratory Data Analysis

df <- read.csv("salaries.csv", stringsAsFactors = TRUE)
summary(df)
##    work_year    experience_level employment_type
##  Min.   :2020   EN:147           CT:   8        
##  1st Qu.:2022   EX: 45           FL:   5        
##  Median :2022   MI:337           FT:1306        
##  Mean   :2022   SE:803           PT:  13        
##  3rd Qu.:2022                                   
##  Max.   :2022                                   
##                                                 
##                      job_title       salary         salary_currency
##  Data Scientist           :344   Min.   :    2324   USD    :1019   
##  Data Engineer            :320   1st Qu.:   80000   EUR    : 134   
##  Data Analyst             :190   Median :  130000   GBP    :  81   
##  Machine Learning Engineer: 86   Mean   :  237712   INR    :  40   
##  Analytics Engineer       : 43   3rd Qu.:  175100   CAD    :  21   
##  Data Architect           : 37   Max.   :30400000   AUD    :   8   
##  (Other)                  :312                      (Other):  29   
##  salary_in_usd    employee_residence  remote_ratio    company_location
##  Min.   :  2324   US     :910        Min.   :  0.00   US     :939     
##  1st Qu.: 75593   GB     : 83        1st Qu.:  0.00   GB     : 87     
##  Median :120000   IN     : 45        Median :100.00   CA     : 39     
##  Mean   :123375   CA     : 38        Mean   : 63.85   IN     : 36     
##  3rd Qu.:164997   DE     : 31        3rd Qu.:100.00   DE     : 35     
##  Max.   :600000   ES     : 29        Max.   :100.00   ES     : 28     
##                   (Other):196                         (Other):168     
##  company_size
##  L:322       
##  M:898       
##  S:112       
##              
##              
##              
## 
str(df)
## 'data.frame':    1332 obs. of  11 variables:
##  $ work_year         : int  2022 2022 2022 2022 2022 2022 2022 2022 2022 2022 ...
##  $ experience_level  : Factor w/ 4 levels "EN","EX","MI",..: 3 3 3 3 3 3 4 4 4 4 ...
##  $ employment_type   : Factor w/ 4 levels "CT","FL","FT",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ job_title         : Factor w/ 64 levels "3D Computer Vision Researcher",..: 47 47 31 31 31 31 22 22 31 31 ...
##  $ salary            : int  130000 90000 120000 100000 85000 78000 161000 110000 136000 104000 ...
##  $ salary_currency   : Factor w/ 18 levels "AUD","BRL","CAD",..: 18 18 18 18 18 18 18 18 18 18 ...
##  $ salary_in_usd     : int  130000 90000 120000 100000 85000 78000 161000 110000 136000 104000 ...
##  $ employee_residence: Factor w/ 64 levels "AE","AR","AT",..: 63 63 63 63 63 63 63 63 63 63 ...
##  $ remote_ratio      : int  0 0 100 100 100 100 100 100 100 100 ...
##  $ company_location  : Factor w/ 59 levels "AE","AL","AR",..: 58 58 58 58 58 58 58 58 58 58 ...
##  $ company_size      : Factor w/ 3 levels "L","M","S": 2 2 2 2 2 2 2 2 2 2 ...
anyNA(df)
## [1] FALSE
df$salary <- NULL

df$remote_ratio <- as.factor(df$remote_ratio)
levels(df$remote_ratio) <- list("Office" = 0, "Hybrid" = 50,  "Remote" = 100)

df$salary_group <- cut(df$salary_in_usd, c(0, 25000, 50000, 75000, 100000, 125000, 150000, 200000, 250000, 1000000))
df$salary_group <- as.factor(df$salary_group)
levels(df$salary_group) <- c("25k", "50k", "75k", "100k", "125k", "150k", "200k", "250k", "1M")
# group df$employee_residence to different regions in the world. The data is in ISO 3166-1 alpha-2 format. Regions like North America, Europe, Asia, etc.

northAmerica <-c("US", "CA", "MX")
southAmerica <- c("AR", "BO", "BR", "CL", "CO")
centralAmerica <- c("CR", "DO", "HN", "PR")
europe <- c("AL", "AT", "AX", "BE", "BG", "CH", "CZ", "DE", "DK", "EE", "ES", "FI", "FR", "GB", "GR", "HR", "HU", "IE", "IS", "IT", "JE", "LU", "MD", "MT", "NL", "PL", "PT", "RO", "RS", "RU", "SI", "TR", "UA")
asia <- c("AE", "CN", "HK", "ID", "IL", "IN", "IQ", "IR", "JP", "MY", "PH", "PK", "QA", "SG", "TH", "VN")
africa <- c("DZ", "EG", "KE", "NG", "TN", "TZ")
oceania <- c("AU", "NZ", "AS")

df$employee_region <- as.factor(df$employee_residence)
levels(df$employee_region) <- list(  
  "North America" = northAmerica,   
  "South America" = southAmerica,   
  "Central America" = centralAmerica,
  "Europe" = europe,
  "Asia" = asia,   
  "Africa" = africa, 
  "Oceania" = oceania)

df$company_region <- as.factor(df$company_location)
levels(df$company_region) <- list(  
  "North America" = northAmerica,   
  "South America" = southAmerica,   
  "Central America" = centralAmerica,
  "Europe" = europe,
  "Asia" = asia,   
  "Africa" = africa, 
  "Oceania" = oceania)
aux <- data.frame(matrix(0, nrow = length(levels(df$company_region)), ncol = length(levels(df$company_region))))
colnames(aux) <- levels(df$company_region)
rownames(aux) <- levels(df$company_region)

for (i in 1:nrow(aux)) {
  for (j in 1:ncol(aux)) {
    aux[i, j] <- length(which(df$company_region == colnames(aux)[j] & df$employee_region == rownames(aux)[i]))
  }
}

data_long <- aux %>%
  rownames_to_column %>%
  gather(key = 'key', value = 'value', -rowname) %>%
  filter(value > 0)
colnames(data_long) <- c("source", "target", "value")
data_long$target <- paste(data_long$target, " ", sep="")

nodes <- data.frame(name=c(as.character(data_long$source), as.character(data_long$target)) %>% unique())
 
data_long$IDsource=match(data_long$source, nodes$name)-1 
data_long$IDtarget=match(data_long$target, nodes$name)-1

sankeyNetwork(Links = data_long, Nodes = nodes,
                     Source = "IDsource", Target = "IDtarget",
                     Value = "value", NodeID = "name", 
                     sinksRight=FALSE, nodeWidth=30, fontSize=14, nodePadding=15)
levels(factor(df$job_title))  
##  [1] "3D Computer Vision Researcher"           
##  [2] "AI Scientist"                            
##  [3] "Analytics Engineer"                      
##  [4] "Applied Data Scientist"                  
##  [5] "Applied Machine Learning Scientist"      
##  [6] "Applied Scientist"                       
##  [7] "BI Analyst"                              
##  [8] "BI Data Analyst"                         
##  [9] "Big Data Architect"                      
## [10] "Big Data Engineer"                       
## [11] "Business Data Analyst"                   
## [12] "Cloud Data Architect"                    
## [13] "Cloud Data Engineer"                     
## [14] "Computer Vision Engineer"                
## [15] "Computer Vision Software Engineer"       
## [16] "Data Analyst"                            
## [17] "Data Analytics Consultant"               
## [18] "Data Analytics Engineer"                 
## [19] "Data Analytics Lead"                     
## [20] "Data Analytics Manager"                  
## [21] "Data Architect"                          
## [22] "Data Engineer"                           
## [23] "Data Engineering Manager"                
## [24] "Data Manager"                            
## [25] "Data Operations Analyst"                 
## [26] "Data Operations Engineer"                
## [27] "Data Science Consultant"                 
## [28] "Data Science Engineer"                   
## [29] "Data Science Lead"                       
## [30] "Data Science Manager"                    
## [31] "Data Scientist"                          
## [32] "Data Scientist Lead"                     
## [33] "Data Specialist"                         
## [34] "Director of Data Engineering"            
## [35] "Director of Data Science"                
## [36] "ETL Developer"                           
## [37] "Finance Data Analyst"                    
## [38] "Financial Data Analyst"                  
## [39] "Head of Data"                            
## [40] "Head of Data Science"                    
## [41] "Head of Machine Learning"                
## [42] "Lead Data Analyst"                       
## [43] "Lead Data Engineer"                      
## [44] "Lead Data Scientist"                     
## [45] "Lead Machine Learning Engineer"          
## [46] "Machine Learning Developer"              
## [47] "Machine Learning Engineer"               
## [48] "Machine Learning Infrastructure Engineer"
## [49] "Machine Learning Manager"                
## [50] "Machine Learning Research Engineer"      
## [51] "Machine Learning Scientist"              
## [52] "Marketing Data Analyst"                  
## [53] "ML Engineer"                             
## [54] "NLP Engineer"                            
## [55] "Power BI Developer"                      
## [56] "Principal Data Analyst"                  
## [57] "Principal Data Architect"                
## [58] "Principal Data Engineer"                 
## [59] "Principal Data Scientist"                
## [60] "Product Data Analyst"                    
## [61] "Product Data Scientist"                  
## [62] "Research Engineer"                       
## [63] "Research Scientist"                      
## [64] "Staff Data Scientist"
df$industry <- ifelse( grepl("data", df$job_title, ignore.case = T), "DATA", "OTHER")
df$industry <- ifelse( grepl("machine", df$job_title, ignore.case = T), "ML/AI", df$industry)
df$industry <- ifelse( grepl("ML", df$job_title, ignore.case = T), "ML/AI", df$industry)
df$industry <- ifelse( grepl("AI", df$job_title, ignore.case = T), "ML/AI", df$industry)
df$industry <- as.factor(df$industry)

df$boss <- ifelse(grepl("head", df$job_title, ignore.case = T), TRUE, FALSE)
df$boss <- ifelse(grepl("lead", df$job_title, ignore.case = T), TRUE, df$boss)
df$boss <- ifelse(grepl("principal", df$job_title, ignore.case = T), TRUE, df$boss)
df$boss <- ifelse(grepl("director", df$job_title, ignore.case = T), TRUE, df$boss)

df$role <- ifelse(grepl("scientist", df$job_title, ignore.case = T), "SCIENTIST", "OTHER")
df$role <- ifelse(grepl("engineer", df$job_title, ignore.case = T), "ENGINEER", df$role)
df$role <- ifelse(grepl("analyst", df$job_title, ignore.case = T), "ANALYST", df$role)
df$role <- ifelse(grepl("manager", df$job_title, ignore.case = T), "MANAGER", df$role)
df$role <- ifelse(grepl("architect", df$job_title, ignore.case = T), "ARCHITECT", df$role)
df$role <- ifelse(grepl("developer", df$job_title, ignore.case = T), "DEVELOPER", df$role)
df$role <- as.factor(df$role)

df$research <- ifelse(grepl("research", df$job_title, ignore.case = T), TRUE, FALSE)

df$job_title[df$job_title == "ML Engineer"] <- "Machine Learning Engineer"

summary(df)
##    work_year    experience_level employment_type
##  Min.   :2020   EN:147           CT:   8        
##  1st Qu.:2022   EX: 45           FL:   5        
##  Median :2022   MI:337           FT:1306        
##  Mean   :2022   SE:803           PT:  13        
##  3rd Qu.:2022                                   
##  Max.   :2022                                   
##                                                 
##                      job_title   salary_currency salary_in_usd   
##  Data Scientist           :344   USD    :1019    Min.   :  2324  
##  Data Engineer            :320   EUR    : 134    1st Qu.: 75593  
##  Data Analyst             :190   GBP    :  81    Median :120000  
##  Machine Learning Engineer:102   INR    :  40    Mean   :123375  
##  Analytics Engineer       : 43   CAD    :  21    3rd Qu.:164997  
##  Data Architect           : 37   AUD    :   8    Max.   :600000  
##  (Other)                  :296   (Other):  29                    
##  employee_residence remote_ratio company_location company_size  salary_group
##  US     :910        Office:412   US     :939      L:322        200k   :283  
##  GB     : 83        Hybrid:139   GB     : 87      M:898        150k   :231  
##  IN     : 45        Remote:781   CA     : 39      S:112        100k   :201  
##  CA     : 38                     IN     : 36                   75k    :155  
##  DE     : 31                     DE     : 35                   125k   :155  
##  ES     : 29                     ES     : 28                   250k   :104  
##  (Other):196                     (Other):168                   (Other):203  
##         employee_region         company_region  industry       boss        
##  North America  :951    North America  :982    DATA :1076   Mode :logical  
##  South America  : 26    South America  : 18    ML/AI: 157   FALSE:1276     
##  Central America:  8    Central America:  5    OTHER:  99   TRUE :56       
##  Europe         :253    Europe         :247                                
##  Asia           : 77    Asia           : 62                                
##  Africa         :  7    Africa         :  6                                
##  Oceania        : 10    Oceania        : 12                                
##         role      research      
##  ANALYST  :232   Mode :logical  
##  ARCHITECT: 40   FALSE:1306     
##  DEVELOPER: 16   TRUE :26       
##  ENGINEER :527                  
##  MANAGER  : 50                  
##  OTHER    : 41                  
##  SCIENTIST:426
mean(df$salary_in_usd)
## [1] 123374.7
mean(df[df$boss == FALSE, 'salary_in_usd'])
## [1] 121704.5
mean(df[df$boss == TRUE, 'salary_in_usd'])
## [1] 161430.6
numerics_names <- colnames(df[, sapply(df, is.numeric)])
factors_names <- colnames(df[, sapply(df, is.factor)])

for (i in 1:length(numerics_names)){
  print(ggplot(df, aes_string(x= numerics_names[i])) +
    ggdist::stat_halfeye(
      adjust=0.5,
      justification = -.2,
      .width=0
    ) +
    geom_boxplot(
      width = .1,
      alpha = 0.5
    ) +
     labs(title=paste("Distribution and boxplot of",numerics_names[i]), y="count"))
}
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation ideoms with `aes()`
## Warning: Using the `size` aesthietic with geom_segment was deprecated in ggplot2 3.4.0.
## ℹ Please use the `linewidth` aesthetic instead.

for (i in 1:length(factors_names)){
  print(ggplot(df, aes_string(x= factors_names[i])) +
    geom_bar()+
     labs(title=paste("Histogram of",factors_names[i]),y="count"))
}

plot(df$employee_residence, df$salary_in_usd, xlab="Employee Residence", ylab="Salary in USD")

plot(df$company_size, df$salary_in_usd, xlab="Company Size", ylab="Salary in USD")

plot(df$company_location, df$salary_in_usd, xlab="Company Location", ylab="Salary in USD")

plot(df$experience_level, df$salary_in_usd, xlab="Experience Level", ylab="Salary in USD")

plot(df$remote_ratio, df$salary_in_usd, xlab="Remote Ratio", ylab="Salary in USD")

y_2020 <- df[df$work_year == 2020,]
y_2021 <- df[df$work_year == 2021,]
y_2022 <- df[df$work_year == 2022,]

y_2020 <- y_2020 %>% group_by(remote_ratio) %>% summarise(count = n())
y_2021 <- y_2021 %>% group_by(remote_ratio) %>% summarise(count = n())
y_2022 <- y_2022 %>% group_by(remote_ratio) %>% summarise(count = n())

y_2020$percentage <- y_2020$count / sum(y_2020$count) * 100
y_2021$percentage <- y_2021$count / sum(y_2021$count) * 100
y_2022$percentage <- y_2022$count / sum(y_2022$count) * 100

y_2020$work_year <- 2020
y_2021$work_year <- 2021
y_2022$work_year <- 2022

y <- rbind(y_2020, y_2021, y_2022)

ggplot(y, aes(x = work_year, y = percentage, group = remote_ratio, color = remote_ratio)) +
  geom_line()+
  geom_point() + 
  labs(x = "Work Year", y = "Percentage of Remote Ratio", title = "Remote Ratio by Work Year") +
  theme(plot.title = element_text(hjust = 0.5), axis.text=element_text(size=12),
        axis.title=element_text(size=12)) +
  scale_x_continuous(breaks = c(2020, 2021, 2022))

Part 2: Context of the problem, Models and Evaluation

1. Profile-based prediction and maximization of compensation

#First we remove some columns that wont bring useful information to the model:

dfTreeRegression <- df
dfTreeRegression$salary_currency <- NULL

#We are going to try to predict salary with a regression tree. We are going to eliminate the created "salary_group" column because otherwise it would be kinda cheating since its the value that we are trying to predict.

dfTreeRegression$salary_group <- NULL

#After making a couple of tests, we saw that these variables are never used by the tree to make predictions, so we are going to remove them from the set so getting the best options for our profiles is easier later:

dfTreeRegression$company_region <- NULL
dfTreeRegression$employee_region <- NULL
dfTreeRegression$work_year <- NULL
dfTreeRegression$research <- NULL
dfTreeRegression$employment_type <- NULL
dfTreeRegression$role <- NULL
dfTreeRegression$boss <- NULL

set.seed(42)
sample <- sample(c(TRUE, FALSE), nrow(dfTreeRegression), replace=TRUE, prob=c(0.8,0.2))
Tree_train  <- dfTreeRegression[sample, ]
Tree_test   <- dfTreeRegression[!sample, ]
#Con el parametro "cp", cuanto mas proximo a 0, menos poda hace.
fit <- rpart(salary_in_usd~., data = Tree_train, method="anova", control = list(cp = 0.0001))
rules <- rpart.rules(fit)

test_pred <- predict(fit, Tree_test, method="anova")
results <- data.frame(test_pred, Tree_test$salary_in_usd)
results$residuals <- results$Tree_test.salary_in_usd-results$test_pred
plot(results$residuals)
abline(0,0, col="red")
resmean <- mean(results$residuals)
abline(resmean,0, col="blue")

sd(results$residuals)
## [1] 57435.01
nrow(rules)
## [1] 51
datos_monica <- Tree_test[1,]
datos_monica[1,] = list("MI", "Data Scientist",0, "ES", "Hybrid", "ES", "S", "DATA")
options(width=1000)

salarios = rpart.predict(fit, datos_monica)
salarios
##        1 
## 40856.43
wallis_grid <- expand.grid(
  experience_level = c("EN"),
  job_title = c("ML Engineer"),
  salary_in_usd = 0,
  employee_residence = c("US"),
  remote_ratio = levels(df$remote_ratio),
  company_location = c("US"),
  company_size = levels(df$company_size),
  industry = c("ML/AI")
)
wallis_grid
##   experience_level   job_title salary_in_usd employee_residence remote_ratio company_location company_size industry
## 1               EN ML Engineer             0                 US       Office               US            L    ML/AI
## 2               EN ML Engineer             0                 US       Hybrid               US            L    ML/AI
## 3               EN ML Engineer             0                 US       Remote               US            L    ML/AI
## 4               EN ML Engineer             0                 US       Office               US            M    ML/AI
## 5               EN ML Engineer             0                 US       Hybrid               US            M    ML/AI
## 6               EN ML Engineer             0                 US       Remote               US            M    ML/AI
## 7               EN ML Engineer             0                 US       Office               US            S    ML/AI
## 8               EN ML Engineer             0                 US       Hybrid               US            S    ML/AI
## 9               EN ML Engineer             0                 US       Remote               US            S    ML/AI
prediction <- rpart.predict(fit, wallis_grid)
wallis_max <- max(prediction)
wallis_best_combinations_index <- which(wallis_max == prediction)
wallis_best_combinations <- wallis_grid[wallis_best_combinations_index,]
wallis_best_combinations
##   experience_level   job_title salary_in_usd employee_residence remote_ratio company_location company_size industry
## 4               EN ML Engineer             0                 US       Office               US            M    ML/AI
## 5               EN ML Engineer             0                 US       Hybrid               US            M    ML/AI
## 6               EN ML Engineer             0                 US       Remote               US            M    ML/AI
## 7               EN ML Engineer             0                 US       Office               US            S    ML/AI
## 8               EN ML Engineer             0                 US       Hybrid               US            S    ML/AI
## 9               EN ML Engineer             0                 US       Remote               US            S    ML/AI
wallis_max
## [1] 102330.8
dawid_grid <- expand.grid(
  experience_level = c("MI"),
  job_title = c("ML Engineer"),
  salary_in_usd = 0,
  employee_residence = c("ES"),
  remote_ratio = c("Remote"),
  company_location = levels(df$company_location),
  company_size = levels(df$company_size),
  industry = c("ML/AI")
)
dawid_grid
##     experience_level   job_title salary_in_usd employee_residence remote_ratio company_location company_size industry
## 1                 MI ML Engineer             0                 ES       Remote               AE            L    ML/AI
## 2                 MI ML Engineer             0                 ES       Remote               AL            L    ML/AI
## 3                 MI ML Engineer             0                 ES       Remote               AR            L    ML/AI
## 4                 MI ML Engineer             0                 ES       Remote               AS            L    ML/AI
## 5                 MI ML Engineer             0                 ES       Remote               AT            L    ML/AI
## 6                 MI ML Engineer             0                 ES       Remote               AU            L    ML/AI
## 7                 MI ML Engineer             0                 ES       Remote               BE            L    ML/AI
## 8                 MI ML Engineer             0                 ES       Remote               BO            L    ML/AI
## 9                 MI ML Engineer             0                 ES       Remote               BR            L    ML/AI
## 10                MI ML Engineer             0                 ES       Remote               CA            L    ML/AI
## 11                MI ML Engineer             0                 ES       Remote               CH            L    ML/AI
## 12                MI ML Engineer             0                 ES       Remote               CL            L    ML/AI
## 13                MI ML Engineer             0                 ES       Remote               CN            L    ML/AI
## 14                MI ML Engineer             0                 ES       Remote               CO            L    ML/AI
## 15                MI ML Engineer             0                 ES       Remote               CZ            L    ML/AI
## 16                MI ML Engineer             0                 ES       Remote               DE            L    ML/AI
## 17                MI ML Engineer             0                 ES       Remote               DK            L    ML/AI
## 18                MI ML Engineer             0                 ES       Remote               DZ            L    ML/AI
## 19                MI ML Engineer             0                 ES       Remote               EE            L    ML/AI
## 20                MI ML Engineer             0                 ES       Remote               EG            L    ML/AI
## 21                MI ML Engineer             0                 ES       Remote               ES            L    ML/AI
## 22                MI ML Engineer             0                 ES       Remote               FI            L    ML/AI
## 23                MI ML Engineer             0                 ES       Remote               FR            L    ML/AI
## 24                MI ML Engineer             0                 ES       Remote               GB            L    ML/AI
## 25                MI ML Engineer             0                 ES       Remote               GR            L    ML/AI
## 26                MI ML Engineer             0                 ES       Remote               HN            L    ML/AI
## 27                MI ML Engineer             0                 ES       Remote               HR            L    ML/AI
## 28                MI ML Engineer             0                 ES       Remote               HU            L    ML/AI
## 29                MI ML Engineer             0                 ES       Remote               ID            L    ML/AI
## 30                MI ML Engineer             0                 ES       Remote               IE            L    ML/AI
## 31                MI ML Engineer             0                 ES       Remote               IL            L    ML/AI
## 32                MI ML Engineer             0                 ES       Remote               IN            L    ML/AI
## 33                MI ML Engineer             0                 ES       Remote               IQ            L    ML/AI
## 34                MI ML Engineer             0                 ES       Remote               IR            L    ML/AI
## 35                MI ML Engineer             0                 ES       Remote               IT            L    ML/AI
## 36                MI ML Engineer             0                 ES       Remote               JP            L    ML/AI
## 37                MI ML Engineer             0                 ES       Remote               KE            L    ML/AI
## 38                MI ML Engineer             0                 ES       Remote               LU            L    ML/AI
## 39                MI ML Engineer             0                 ES       Remote               MD            L    ML/AI
## 40                MI ML Engineer             0                 ES       Remote               MT            L    ML/AI
## 41                MI ML Engineer             0                 ES       Remote               MX            L    ML/AI
## 42                MI ML Engineer             0                 ES       Remote               MY            L    ML/AI
## 43                MI ML Engineer             0                 ES       Remote               NG            L    ML/AI
## 44                MI ML Engineer             0                 ES       Remote               NL            L    ML/AI
## 45                MI ML Engineer             0                 ES       Remote               NZ            L    ML/AI
## 46                MI ML Engineer             0                 ES       Remote               PH            L    ML/AI
## 47                MI ML Engineer             0                 ES       Remote               PK            L    ML/AI
## 48                MI ML Engineer             0                 ES       Remote               PL            L    ML/AI
## 49                MI ML Engineer             0                 ES       Remote               PR            L    ML/AI
## 50                MI ML Engineer             0                 ES       Remote               PT            L    ML/AI
## 51                MI ML Engineer             0                 ES       Remote               RO            L    ML/AI
## 52                MI ML Engineer             0                 ES       Remote               RU            L    ML/AI
## 53                MI ML Engineer             0                 ES       Remote               SG            L    ML/AI
## 54                MI ML Engineer             0                 ES       Remote               SI            L    ML/AI
## 55                MI ML Engineer             0                 ES       Remote               TH            L    ML/AI
## 56                MI ML Engineer             0                 ES       Remote               TR            L    ML/AI
## 57                MI ML Engineer             0                 ES       Remote               UA            L    ML/AI
## 58                MI ML Engineer             0                 ES       Remote               US            L    ML/AI
## 59                MI ML Engineer             0                 ES       Remote               VN            L    ML/AI
## 60                MI ML Engineer             0                 ES       Remote               AE            M    ML/AI
## 61                MI ML Engineer             0                 ES       Remote               AL            M    ML/AI
## 62                MI ML Engineer             0                 ES       Remote               AR            M    ML/AI
## 63                MI ML Engineer             0                 ES       Remote               AS            M    ML/AI
## 64                MI ML Engineer             0                 ES       Remote               AT            M    ML/AI
## 65                MI ML Engineer             0                 ES       Remote               AU            M    ML/AI
## 66                MI ML Engineer             0                 ES       Remote               BE            M    ML/AI
## 67                MI ML Engineer             0                 ES       Remote               BO            M    ML/AI
## 68                MI ML Engineer             0                 ES       Remote               BR            M    ML/AI
## 69                MI ML Engineer             0                 ES       Remote               CA            M    ML/AI
## 70                MI ML Engineer             0                 ES       Remote               CH            M    ML/AI
## 71                MI ML Engineer             0                 ES       Remote               CL            M    ML/AI
## 72                MI ML Engineer             0                 ES       Remote               CN            M    ML/AI
## 73                MI ML Engineer             0                 ES       Remote               CO            M    ML/AI
## 74                MI ML Engineer             0                 ES       Remote               CZ            M    ML/AI
## 75                MI ML Engineer             0                 ES       Remote               DE            M    ML/AI
## 76                MI ML Engineer             0                 ES       Remote               DK            M    ML/AI
## 77                MI ML Engineer             0                 ES       Remote               DZ            M    ML/AI
## 78                MI ML Engineer             0                 ES       Remote               EE            M    ML/AI
## 79                MI ML Engineer             0                 ES       Remote               EG            M    ML/AI
## 80                MI ML Engineer             0                 ES       Remote               ES            M    ML/AI
## 81                MI ML Engineer             0                 ES       Remote               FI            M    ML/AI
## 82                MI ML Engineer             0                 ES       Remote               FR            M    ML/AI
## 83                MI ML Engineer             0                 ES       Remote               GB            M    ML/AI
## 84                MI ML Engineer             0                 ES       Remote               GR            M    ML/AI
## 85                MI ML Engineer             0                 ES       Remote               HN            M    ML/AI
## 86                MI ML Engineer             0                 ES       Remote               HR            M    ML/AI
## 87                MI ML Engineer             0                 ES       Remote               HU            M    ML/AI
## 88                MI ML Engineer             0                 ES       Remote               ID            M    ML/AI
## 89                MI ML Engineer             0                 ES       Remote               IE            M    ML/AI
## 90                MI ML Engineer             0                 ES       Remote               IL            M    ML/AI
## 91                MI ML Engineer             0                 ES       Remote               IN            M    ML/AI
## 92                MI ML Engineer             0                 ES       Remote               IQ            M    ML/AI
## 93                MI ML Engineer             0                 ES       Remote               IR            M    ML/AI
## 94                MI ML Engineer             0                 ES       Remote               IT            M    ML/AI
## 95                MI ML Engineer             0                 ES       Remote               JP            M    ML/AI
## 96                MI ML Engineer             0                 ES       Remote               KE            M    ML/AI
## 97                MI ML Engineer             0                 ES       Remote               LU            M    ML/AI
## 98                MI ML Engineer             0                 ES       Remote               MD            M    ML/AI
## 99                MI ML Engineer             0                 ES       Remote               MT            M    ML/AI
## 100               MI ML Engineer             0                 ES       Remote               MX            M    ML/AI
## 101               MI ML Engineer             0                 ES       Remote               MY            M    ML/AI
## 102               MI ML Engineer             0                 ES       Remote               NG            M    ML/AI
## 103               MI ML Engineer             0                 ES       Remote               NL            M    ML/AI
## 104               MI ML Engineer             0                 ES       Remote               NZ            M    ML/AI
## 105               MI ML Engineer             0                 ES       Remote               PH            M    ML/AI
## 106               MI ML Engineer             0                 ES       Remote               PK            M    ML/AI
## 107               MI ML Engineer             0                 ES       Remote               PL            M    ML/AI
## 108               MI ML Engineer             0                 ES       Remote               PR            M    ML/AI
## 109               MI ML Engineer             0                 ES       Remote               PT            M    ML/AI
## 110               MI ML Engineer             0                 ES       Remote               RO            M    ML/AI
## 111               MI ML Engineer             0                 ES       Remote               RU            M    ML/AI
## 112               MI ML Engineer             0                 ES       Remote               SG            M    ML/AI
## 113               MI ML Engineer             0                 ES       Remote               SI            M    ML/AI
## 114               MI ML Engineer             0                 ES       Remote               TH            M    ML/AI
## 115               MI ML Engineer             0                 ES       Remote               TR            M    ML/AI
## 116               MI ML Engineer             0                 ES       Remote               UA            M    ML/AI
## 117               MI ML Engineer             0                 ES       Remote               US            M    ML/AI
## 118               MI ML Engineer             0                 ES       Remote               VN            M    ML/AI
## 119               MI ML Engineer             0                 ES       Remote               AE            S    ML/AI
## 120               MI ML Engineer             0                 ES       Remote               AL            S    ML/AI
## 121               MI ML Engineer             0                 ES       Remote               AR            S    ML/AI
## 122               MI ML Engineer             0                 ES       Remote               AS            S    ML/AI
## 123               MI ML Engineer             0                 ES       Remote               AT            S    ML/AI
## 124               MI ML Engineer             0                 ES       Remote               AU            S    ML/AI
## 125               MI ML Engineer             0                 ES       Remote               BE            S    ML/AI
## 126               MI ML Engineer             0                 ES       Remote               BO            S    ML/AI
## 127               MI ML Engineer             0                 ES       Remote               BR            S    ML/AI
## 128               MI ML Engineer             0                 ES       Remote               CA            S    ML/AI
## 129               MI ML Engineer             0                 ES       Remote               CH            S    ML/AI
## 130               MI ML Engineer             0                 ES       Remote               CL            S    ML/AI
## 131               MI ML Engineer             0                 ES       Remote               CN            S    ML/AI
## 132               MI ML Engineer             0                 ES       Remote               CO            S    ML/AI
## 133               MI ML Engineer             0                 ES       Remote               CZ            S    ML/AI
## 134               MI ML Engineer             0                 ES       Remote               DE            S    ML/AI
## 135               MI ML Engineer             0                 ES       Remote               DK            S    ML/AI
## 136               MI ML Engineer             0                 ES       Remote               DZ            S    ML/AI
## 137               MI ML Engineer             0                 ES       Remote               EE            S    ML/AI
## 138               MI ML Engineer             0                 ES       Remote               EG            S    ML/AI
## 139               MI ML Engineer             0                 ES       Remote               ES            S    ML/AI
## 140               MI ML Engineer             0                 ES       Remote               FI            S    ML/AI
## 141               MI ML Engineer             0                 ES       Remote               FR            S    ML/AI
## 142               MI ML Engineer             0                 ES       Remote               GB            S    ML/AI
## 143               MI ML Engineer             0                 ES       Remote               GR            S    ML/AI
## 144               MI ML Engineer             0                 ES       Remote               HN            S    ML/AI
## 145               MI ML Engineer             0                 ES       Remote               HR            S    ML/AI
## 146               MI ML Engineer             0                 ES       Remote               HU            S    ML/AI
## 147               MI ML Engineer             0                 ES       Remote               ID            S    ML/AI
## 148               MI ML Engineer             0                 ES       Remote               IE            S    ML/AI
## 149               MI ML Engineer             0                 ES       Remote               IL            S    ML/AI
## 150               MI ML Engineer             0                 ES       Remote               IN            S    ML/AI
## 151               MI ML Engineer             0                 ES       Remote               IQ            S    ML/AI
## 152               MI ML Engineer             0                 ES       Remote               IR            S    ML/AI
## 153               MI ML Engineer             0                 ES       Remote               IT            S    ML/AI
## 154               MI ML Engineer             0                 ES       Remote               JP            S    ML/AI
## 155               MI ML Engineer             0                 ES       Remote               KE            S    ML/AI
## 156               MI ML Engineer             0                 ES       Remote               LU            S    ML/AI
## 157               MI ML Engineer             0                 ES       Remote               MD            S    ML/AI
## 158               MI ML Engineer             0                 ES       Remote               MT            S    ML/AI
## 159               MI ML Engineer             0                 ES       Remote               MX            S    ML/AI
## 160               MI ML Engineer             0                 ES       Remote               MY            S    ML/AI
## 161               MI ML Engineer             0                 ES       Remote               NG            S    ML/AI
## 162               MI ML Engineer             0                 ES       Remote               NL            S    ML/AI
## 163               MI ML Engineer             0                 ES       Remote               NZ            S    ML/AI
## 164               MI ML Engineer             0                 ES       Remote               PH            S    ML/AI
## 165               MI ML Engineer             0                 ES       Remote               PK            S    ML/AI
## 166               MI ML Engineer             0                 ES       Remote               PL            S    ML/AI
## 167               MI ML Engineer             0                 ES       Remote               PR            S    ML/AI
## 168               MI ML Engineer             0                 ES       Remote               PT            S    ML/AI
## 169               MI ML Engineer             0                 ES       Remote               RO            S    ML/AI
## 170               MI ML Engineer             0                 ES       Remote               RU            S    ML/AI
## 171               MI ML Engineer             0                 ES       Remote               SG            S    ML/AI
## 172               MI ML Engineer             0                 ES       Remote               SI            S    ML/AI
## 173               MI ML Engineer             0                 ES       Remote               TH            S    ML/AI
## 174               MI ML Engineer             0                 ES       Remote               TR            S    ML/AI
## 175               MI ML Engineer             0                 ES       Remote               UA            S    ML/AI
## 176               MI ML Engineer             0                 ES       Remote               US            S    ML/AI
## 177               MI ML Engineer             0                 ES       Remote               VN            S    ML/AI
prediction <- rpart.predict(fit, dawid_grid)
dawid_max <- max(prediction)
dawid_best_combinations_index <- which(dawid_max == prediction)
dawid_best_combinations <- dawid_grid[dawid_best_combinations_index,]
dawid_best_combinations
##     experience_level   job_title salary_in_usd employee_residence remote_ratio company_location company_size industry
## 1                 MI ML Engineer             0                 ES       Remote               AE            L    ML/AI
## 5                 MI ML Engineer             0                 ES       Remote               AT            L    ML/AI
## 7                 MI ML Engineer             0                 ES       Remote               BE            L    ML/AI
## 13                MI ML Engineer             0                 ES       Remote               CN            L    ML/AI
## 18                MI ML Engineer             0                 ES       Remote               DZ            L    ML/AI
## 22                MI ML Engineer             0                 ES       Remote               FI            L    ML/AI
## 24                MI ML Engineer             0                 ES       Remote               GB            L    ML/AI
## 33                MI ML Engineer             0                 ES       Remote               IQ            L    ML/AI
## 36                MI ML Engineer             0                 ES       Remote               JP            L    ML/AI
## 52                MI ML Engineer             0                 ES       Remote               RU            L    ML/AI
## 54                MI ML Engineer             0                 ES       Remote               SI            L    ML/AI
## 58                MI ML Engineer             0                 ES       Remote               US            L    ML/AI
## 60                MI ML Engineer             0                 ES       Remote               AE            M    ML/AI
## 64                MI ML Engineer             0                 ES       Remote               AT            M    ML/AI
## 66                MI ML Engineer             0                 ES       Remote               BE            M    ML/AI
## 72                MI ML Engineer             0                 ES       Remote               CN            M    ML/AI
## 77                MI ML Engineer             0                 ES       Remote               DZ            M    ML/AI
## 81                MI ML Engineer             0                 ES       Remote               FI            M    ML/AI
## 83                MI ML Engineer             0                 ES       Remote               GB            M    ML/AI
## 92                MI ML Engineer             0                 ES       Remote               IQ            M    ML/AI
## 95                MI ML Engineer             0                 ES       Remote               JP            M    ML/AI
## 111               MI ML Engineer             0                 ES       Remote               RU            M    ML/AI
## 113               MI ML Engineer             0                 ES       Remote               SI            M    ML/AI
## 117               MI ML Engineer             0                 ES       Remote               US            M    ML/AI
## 119               MI ML Engineer             0                 ES       Remote               AE            S    ML/AI
## 123               MI ML Engineer             0                 ES       Remote               AT            S    ML/AI
## 125               MI ML Engineer             0                 ES       Remote               BE            S    ML/AI
## 131               MI ML Engineer             0                 ES       Remote               CN            S    ML/AI
## 136               MI ML Engineer             0                 ES       Remote               DZ            S    ML/AI
## 140               MI ML Engineer             0                 ES       Remote               FI            S    ML/AI
## 142               MI ML Engineer             0                 ES       Remote               GB            S    ML/AI
## 151               MI ML Engineer             0                 ES       Remote               IQ            S    ML/AI
## 154               MI ML Engineer             0                 ES       Remote               JP            S    ML/AI
## 170               MI ML Engineer             0                 ES       Remote               RU            S    ML/AI
## 172               MI ML Engineer             0                 ES       Remote               SI            S    ML/AI
## 176               MI ML Engineer             0                 ES       Remote               US            S    ML/AI
dawid_max
## [1] 109338.5

2. How can data mining techniques be used to identify which employees are at risk of leaving their company?

This could be useful for companies who want to retain their top talent and for employees that are underpaid. We could use different factors such to determine which employees are underpaid compared to their peers.

To answer this questions we will use two different techniques: Clustering and regression.

The following features will be used: - Experience level - Employment type - Industry - Employee region - Role - Remote_ratio - Boss - Research - Company size

Clustering

First we create a separate dataframe for the variables we will use.

# Select the variables to use in the clustering analysis
df_vars <- dplyr::select(df, experience_level, employment_type, company_size, industry, employee_region, boss, research, role, remote_ratio)
#                     
# df_vars$experience_level <- as.numeric(df_vars$experience_level)
# df_vars$industry <- as.numeric(df_vars$industry)
# df_vars$employment_type <- as.numeric(df_vars$employment_type)
# df_vars$company_size <- as.numeric(df_vars$company_size)
# df_vars$employee_region <- as.numeric(df_vars$employee_region)
# df_vars$role <- as.numeric(df_vars$role)
# df_vars$boss <- as.numeric(df_vars$boss)
# df_vars$research <- as.numeric(df_vars$research)
# df_vars$remote_ratio <- as.numeric(df_vars$remote_ratio)
# 
# # Standardize the variables
# df_vars <- scale(df_vars)

Let s find find the elbow in our plot, to find the “best” k for the kmodes algorithm

# results <- data.frame(k = integer(), WCSS = double())
# 
# # Loop through a range of values for k
# for (k in 2:40) {
#   # Run the k-means clustering algorithm
#   km <- kmodes(df_vars, k, iter.max = 50)
#   # Store the results in the data frame
#   results <- rbind(results, data.frame(k = k, WCSS = sum(km$withindiff^2)))
# }
# 
# # Plot the WCSS values for each value of k
# ggplot(results, aes(x = k, y = WCSS)) +
#   geom_line() +
#   geom_point() +
#   labs(x = "Number of clusters (k)", y = "WCSS")
# Perform k-means clustering with n clusters
kmodes_results <- kmodes(df_vars, 20)

df_Q7 <- df
# Add the cluster labels to the original data
df_Q7$cluster <- kmodes_results$cluster

# Inspect the clusters
group_by(df_Q7, cluster) %>%
  summarize(mean_salary = mean(salary_in_usd))
## # A tibble: 20 × 2
##    cluster mean_salary
##      <int>       <dbl>
##  1       1      65656.
##  2       2      89992.
##  3       3      34249.
##  4       4      84749.
##  5       5     258496.
##  6       6      75476.
##  7       7     146911.
##  8       8      47324.
##  9       9     118519.
## 10      10     151188.
## 11      11      76042.
## 12      12      88378.
## 13      13      55665.
## 14      14     148238.
## 15      15     157399.
## 16      16     100367.
## 17      17      78672.
## 18      18     142089.
## 19      19     151811.
## 20      20     126383.
fig1 <- plot_ly(df_Q7, x = ~as.numeric(employee_region), y = ~as.numeric(industry), z = ~as.numeric(experience_level), color = ~cluster)
fig1 <- fig1 %>% add_markers()

axx <- list(ticketmode = 'array', title="Employee region", ticktext = levels(df$employee_region),  tickvals = c(1,2,3,4,5,6,7),  range = c(1,7), tickangle = 25)
axy <- list(ticketmode = 'array', title="Industry", ticktext = levels(df$industry),  tickvals = c(1,2,3),  range = c(1,3), tickangle = 45)
axz <- list(ticketmode = 'array', title="Experience level", ticktext = levels(df$experience_level),  tickvals = c(1,2,3,4),  range = c(1,4), tickangle = 45)

fig1 <- fig1 %>% layout(scene = list(xaxis = axx, yaxis = axy, zaxis = axz))
fig1
fig2 <- plot_ly(df_Q7, x = ~as.numeric(role), y = ~as.numeric(industry), z = ~as.numeric(remote_ratio), color = ~cluster)
fig2 <- fig2 %>% add_markers()

axx <- list(ticketmode = 'array', title="Role", ticktext = levels(df$role),  tickvals = c(1,2,3,4,5,6,7),  range = c(1,7), tickangle = 25)
axy <- list(ticketmode = 'array', title="Industry", ticktext = levels(df$industry),  tickvals = c(1,2,3),  range = c(1,3), tickangle = 45)
axz <- list(ticketmode = 'array', title="Remote ratio", ticktext = levels(df$remote_ratio),  tickvals = c(1,2,3),  range = c(1,3), tickangle = 45)

fig2 <- fig2 %>% layout(scene = list(xaxis = axx, yaxis = axy, zaxis = axz))
fig2
fig3 <- plot_ly(df_Q7, x = ~as.numeric(role), y = ~as.numeric(industry), z = ~as.numeric(remote_ratio), color = ~cluster)
fig3 <- fig3 %>% add_markers()

axx <- list(ticketmode = 'array', title="Role", ticktext = levels(df$role),  tickvals = c(1,2,3,4,5,6,7),  range = c(1,7), tickangle = 25)
axy <- list(ticketmode = 'array', title="Industry", ticktext = levels(df$industry),  tickvals = c(1,2,3),  range = c(1,3), tickangle = 45)
axz <- list(ticketmode = 'array', title="Remote ratio", ticktext = levels(df$remote_ratio),  tickvals = c(1,2,3),  range = c(1,3), tickangle = 45)

fig3 <- fig3 %>% layout(scene = list(xaxis = axx, yaxis = axy, zaxis = axz))
fig3
mean_salary <- tapply(df_Q7$salary_in_usd, df_Q7$cluster, mean)
# View the mean salary for each level of the experience_level variable
df_Q7$salary_difference_kmodes <- 0
df_Q7$predicted_salary_kmodes <-0

for (i in 1:nrow(df_Q7)) {
  cluster_index = match(df_Q7$cluster[i], names(mean_salary))
  df_Q7$salary_difference_kmodes[i] <- df_Q7$salary_in_usd[i] - mean_salary[cluster_index]
  df_Q7$predicted_salary_kmodes[i] <- mean_salary[cluster_index]
}

underpaid_employees_kmodes <- df_Q7[df_Q7$salary_difference_kmodes < 0, ]
overpaid_employees_kmodes <- df_Q7[df_Q7$salary_difference_kmodes > 0, ]

underpaid_employees_kmodes <- underpaid_employees_kmodes[order(underpaid_employees_kmodes$salary_difference_kmodes), ]
overpaid_employees_kmodes <- overpaid_employees_kmodes[order(overpaid_employees_kmodes$salary_difference_kmodes, decreasing = TRUE), ]

# View the underpaid and overpaid employees
head(underpaid_employees_kmodes, n = 10)
##      work_year experience_level employment_type                     job_title salary_currency salary_in_usd employee_residence remote_ratio company_location company_size salary_group employee_region company_region industry  boss      role research cluster salary_difference_kmodes predicted_salary_kmodes
## 1183      2021               MI              FT          Head of Data Science             USD        110000                 US       Office               US            S         125k   North America  North America     DATA  TRUE     OTHER    FALSE       5                -148496.3                258496.3
## 113       2021               SE              FT 3D Computer Vision Researcher             USD          5000                 US       Remote               US            L          25k   North America  North America    OTHER FALSE     OTHER     TRUE      10                -146187.8                151187.8
## 754       2022               EX              FT            Lead Data Engineer             CAD        116073                 CA       Remote               CA            S         125k   North America  North America     DATA  TRUE  ENGINEER    FALSE       5                -142423.3                258496.3
## 1300      2021               SE              FT      Computer Vision Engineer             BRL         18907                 BR       Office               BR            M          25k   South America  South America    OTHER FALSE  ENGINEER    FALSE      19                -132904.5                151811.5
## 1045      2022               EN              FT       Data Analytics Engineer             USD         20000                 PK       Office               PK            M          25k            Asia           Asia     DATA FALSE  ENGINEER    FALSE      19                -131811.5                151811.5
## 612       2022               MI              FT                 Data Engineer             USD         24000                 US       Office               US            M          25k   North America  North America     DATA FALSE  ENGINEER    FALSE      19                -127811.5                151811.5
## 613       2022               MI              FT                 Data Engineer             USD         24000                 US       Office               US            M          25k   North America  North America     DATA FALSE  ENGINEER    FALSE      19                -127811.5                151811.5
## 958       2022               SE              FT                 Data Engineer             USD         25000                 US       Remote               US            M          25k   North America  North America     DATA FALSE  ENGINEER    FALSE      19                -126811.5                151811.5
## 604       2022               SE              FT                Data Scientist             BRL          2428                 BR       Remote               BR            L          25k   South America  South America     DATA FALSE SCIENTIST    FALSE      20                -123955.2                126383.2
## 1167      2020               SE              FT                 Data Engineer             MXN         33511                 MX       Office               MX            S          50k   North America  North America     DATA FALSE  ENGINEER    FALSE      19                -118300.5                151811.5
head(overpaid_employees_kmodes, n=10)
##      work_year experience_level employment_type                          job_title salary_currency salary_in_usd employee_residence remote_ratio company_location company_size salary_group employee_region company_region industry  boss      role research cluster salary_difference_kmodes predicted_salary_kmodes
## 1094      2020               MI              FT                 Research Scientist             USD        450000                 US       Office               US            M           1M   North America  North America    OTHER FALSE SCIENTIST     TRUE       2                 360008.3                89991.67
## 1151      2021               MI              FT             Financial Data Analyst             USD        450000                 US       Remote               US            L           1M   North America  North America     DATA FALSE   ANALYST    FALSE      16                 349633.1               100366.87
## 1279      2021               EX              FT            Principal Data Engineer             USD        600000                 US       Remote               US            L           1M   North America  North America     DATA  TRUE  ENGINEER    FALSE       5                 341503.7               258496.29
## 1325      2021               MI              FT Applied Machine Learning Scientist             USD        423000                 US       Hybrid               US            L           1M   North America  North America    ML/AI FALSE SCIENTIST    FALSE      20                 296616.8               126383.19
## 1328      2020               SE              FT                     Data Scientist             USD        412000                 US       Remote               US            L           1M   North America  North America     DATA FALSE SCIENTIST    FALSE      20                 285616.8               126383.19
## 1040      2022               SE              FT             Applied Data Scientist             USD        380000                 US       Remote               US            L           1M   North America  North America     DATA FALSE SCIENTIST    FALSE      10                 228812.2               151187.78
## 1182      2020               SE              FT         Machine Learning Scientist             USD        260000                 JP       Office               JP            S           1M            Asia           Asia    ML/AI FALSE SCIENTIST    FALSE      13                 204334.7                55665.33
## 123       2022               SE              FT                     Data Architect             USD        345600                 US       Office               US            M           1M   North America  North America     DATA FALSE ARCHITECT    FALSE      19                 193788.5               151811.50
## 1188      2021               SE              FT          Machine Learning Engineer             USD        256000                 US       Remote               US            S           1M   North America  North America    ML/AI FALSE  ENGINEER    FALSE       6                 180524.2                75475.76
## 981       2022               EX              FT                      Data Engineer             USD        324000                 US       Remote               US            M           1M   North America  North America     DATA FALSE  ENGINEER    FALSE      10                 172812.2               151187.78
model <- lm(salary_in_usd ~ experience_level + employment_type + employee_region + company_size + industry + role + boss + research + remote_ratio, data = df_Q7)
summary(model)
## 
## Call:
## lm(formula = salary_in_usd ~ experience_level + employment_type + 
##     employee_region + company_size + industry + role + boss + 
##     research + remote_ratio, data = df_Q7)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -144040  -27814   -4622   24566  359318 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    107605.8    18420.5   5.842 6.52e-09 ***
## experience_levelEX              91027.7     9256.9   9.834  < 2e-16 ***
## experience_levelMI              21839.8     5033.8   4.339 1.54e-05 ***
## experience_levelSE              44618.7     5042.9   8.848  < 2e-16 ***
## employment_typeFL              -58306.6    27880.7  -2.091  0.03670 *  
## employment_typeFT              -22824.1    17458.1  -1.307  0.19132    
## employment_typePT              -18941.1    22108.2  -0.857  0.39174    
## employee_regionSouth America   -76319.0     9958.8  -7.663 3.51e-14 ***
## employee_regionCentral America -12658.5    17384.3  -0.728  0.46665    
## employee_regionEurope          -63878.0     4041.1 -15.807  < 2e-16 ***
## employee_regionAsia            -78396.7     6321.1 -12.402  < 2e-16 ***
## employee_regionAfrica          -52863.3    18875.1  -2.801  0.00517 ** 
## employee_regionOceania         -34816.1    15700.4  -2.218  0.02676 *  
## company_sizeM                   -4965.7     3461.9  -1.434  0.15170    
## company_sizeS                  -12079.2     5508.6  -2.193  0.02850 *  
## industryML/AI                   12211.7     4480.8   2.725  0.00651 ** 
## industryOTHER                     223.2     6380.2   0.035  0.97210    
## roleARCHITECT                   41632.7     8364.9   4.977 7.32e-07 ***
## roleDEVELOPER                   21884.2    13437.0   1.629  0.10363    
## roleENGINEER                    27845.9     4039.5   6.893 8.46e-12 ***
## roleMANAGER                     41158.1     7674.3   5.363 9.67e-08 ***
## roleOTHER                       14733.2     9380.2   1.571  0.11650    
## roleSCIENTIST                   32090.9     4019.9   7.983 3.11e-15 ***
## bossTRUE                        38375.5     7798.9   4.921 9.73e-07 ***
## researchTRUE                     6031.9    11500.5   0.524  0.60003    
## remote_ratioHybrid              -9612.7     5362.0  -1.793  0.07324 .  
## remote_ratioRemote              -1348.6     2994.1  -0.450  0.65249    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 48350 on 1305 degrees of freedom
## Multiple R-squared:  0.4728, Adjusted R-squared:  0.4623 
## F-statistic: 45.02 on 26 and 1305 DF,  p-value: < 2.2e-16
df_Q7$predicted_salary_lm  <- predict(model, newdata = df_Q7)
df_Q7$salary_difference_lm <- df_Q7$salary_in_usd - df_Q7$predicted_salary_lm

# Use the salary difference column to identify underpaid and overpaid employees
underpaid_employees_lm <- df_Q7[df_Q7$salary_difference_lm < 0, ]
overpaid_employees_lm <- df_Q7[df_Q7$salary_difference_lm > 0, ]

underpaid_employees_lm <- underpaid_employees_lm[order(underpaid_employees_lm$salary_difference_lm), ]
overpaid_employees_lm <- overpaid_employees_lm[order(overpaid_employees_lm$salary_difference_lm, decreasing = TRUE), ]

# View the underpaid and overpaid employees
head(underpaid_employees_lm, n = 10)
##      work_year experience_level employment_type                     job_title salary_currency salary_in_usd employee_residence remote_ratio company_location company_size salary_group employee_region company_region industry  boss      role research cluster salary_difference_kmodes predicted_salary_kmodes predicted_salary_lm salary_difference_lm
## 113       2021               SE              FT 3D Computer Vision Researcher             USD          5000                 US       Remote               US            L          25k   North America  North America    OTHER FALSE     OTHER     TRUE      10               -146187.78               151187.78            149040.1            -144040.1
## 958       2022               SE              FT                 Data Engineer             USD         25000                 US       Remote               US            M          25k   North America  North America     DATA FALSE  ENGINEER    FALSE      19               -126811.50               151811.50            150931.9            -125931.9
## 1244      2021               MI              FT                Data Scientist             MXN          2859                 MX       Office               MX            S          25k   North America  North America     DATA FALSE SCIENTIST    FALSE      13                -52806.33                55665.33            126633.1            -123774.1
## 159       2022               SE              FT                Data Scientist             USD         38000                 US       Remote               US            M          50k   North America  North America     DATA FALSE SCIENTIST    FALSE      14               -110237.54               148237.54            155176.9            -117176.9
## 163       2022               SE              FT                Data Scientist             USD         38000                 US       Remote               US            M          50k   North America  North America     DATA FALSE SCIENTIST    FALSE      14               -110237.54               148237.54            155176.9            -117176.9
## 173       2022               SE              FT                Data Scientist             USD         38000                 US       Remote               US            M          50k   North America  North America     DATA FALSE SCIENTIST    FALSE      14               -110237.54               148237.54            155176.9            -117176.9
## 754       2022               EX              FT            Lead Data Engineer             CAD        116073                 CA       Remote               CA            S         125k   North America  North America     DATA  TRUE  ENGINEER    FALSE       5               -142423.29               258496.29            228602.9            -112529.9
## 1167      2020               SE              FT                 Data Engineer             MXN         33511                 MX       Office               MX            S          50k   North America  North America     DATA FALSE  ENGINEER    FALSE      19               -118300.50               151811.50            145167.1            -111656.1
## 114       2022               MI              FT                Data Scientist             USD         30000                 MX       Remote               MX            L          50k   North America  North America     DATA FALSE SCIENTIST    FALSE      20                -96383.19               126383.19            137363.7            -107363.7
## 612       2022               MI              FT                 Data Engineer             USD         24000                 US       Office               US            M          25k   North America  North America     DATA FALSE  ENGINEER    FALSE      19               -127811.50               151811.50            129501.6            -105501.6
head(overpaid_employees_lm, n=10)
##      work_year experience_level employment_type                          job_title salary_currency salary_in_usd employee_residence remote_ratio company_location company_size salary_group employee_region company_region industry  boss      role research cluster salary_difference_kmodes predicted_salary_kmodes predicted_salary_lm salary_difference_lm
## 1279      2021               EX              FT            Principal Data Engineer             USD        600000                 US       Remote               US            L           1M   North America  North America     DATA  TRUE  ENGINEER    FALSE       5                 341503.7               258496.29           240682.16             359317.8
## 1151      2021               MI              FT             Financial Data Analyst             USD        450000                 US       Remote               US            L           1M   North America  North America     DATA FALSE   ANALYST    FALSE      16                 349633.1               100366.87           105272.80             344727.2
## 1094      2020               MI              FT                 Research Scientist             USD        450000                 US       Office               US            M           1M   North America  North America    OTHER FALSE SCIENTIST     TRUE       2                 360008.3                89991.67           140001.66             309998.3
## 1325      2021               MI              FT Applied Machine Learning Scientist             USD        423000                 US       Hybrid               US            L           1M   North America  North America    ML/AI FALSE SCIENTIST    FALSE      20                 296616.8               126383.19           141311.35             281688.7
## 1328      2020               SE              FT                     Data Scientist             USD        412000                 US       Remote               US            L           1M   North America  North America     DATA FALSE SCIENTIST    FALSE      20                 285616.8               126383.19           160142.65             251857.3
## 1035      2022               SE              FT                Data Analytics Lead             USD        405000                 US       Remote               US            L           1M   North America  North America     DATA  TRUE     OTHER    FALSE       5                 146503.7               258496.29           181160.50             223839.5
## 1040      2022               SE              FT             Applied Data Scientist             USD        380000                 US       Remote               US            L           1M   North America  North America     DATA FALSE SCIENTIST    FALSE      10                 228812.2               151187.78           160142.65             219857.3
## 123       2022               SE              FT                     Data Architect             USD        345600                 US       Office               US            M           1M   North America  North America     DATA FALSE ARCHITECT    FALSE      19                 193788.5               151811.50           166067.29             179532.7
## 1182      2020               SE              FT         Machine Learning Scientist             USD        260000                 JP       Office               JP            S           1M            Asia           Asia    ML/AI FALSE SCIENTIST    FALSE      13                 204334.7                55665.33            83227.02             176773.0
## 1250      2021               EX              CT           Principal Data Scientist             USD        416000                 US       Remote               US            S           1M   North America  North America     DATA  TRUE SCIENTIST    FALSE       5                 157503.7               258496.29           255672.04             160328.0
df_Q7$kmodes_ratio_error <- ((df_Q7$predicted_salary_kmodes / df_Q7$salary_in_usd)-1)*100
df_Q7$lm_ratio_error <- ((df_Q7$predicted_salary_lm / df_Q7$salary_in_usd)-1)*100

summary(df_Q7[, c("kmodes_ratio_error","lm_ratio_error")])
##  kmodes_ratio_error lm_ratio_error   
##  Min.   : -80.002   Min.   :-193.78  
##  1st Qu.: -19.918   1st Qu.: -17.07  
##  Median :   4.267   Median :   4.74  
##  Mean   :  49.975   Mean   :  33.21  
##  3rd Qu.:  46.444   3rd Qu.:  32.40  
##  Max.   :5105.239   Max.   :4329.28
quantiles_kmodes <- quantile(df_Q7$kmodes_ratio_error, c(0.25,0.75))
quantiles_lm <- quantile(df_Q7$lm_ratio_error, c(0.25,0.75))
df_Q7 <- df_Q7[order(df_Q7$salary_in_usd),]
# Add a column to the dataframe with a sequence of integers
x <- 1:nrow(df_Q7)

df_Q7$status_lm <- ifelse(df_Q7$lm_ratio_error < quantiles_kmodes[1], "Overpaid", 
                      ifelse(df_Q7$lm_ratio_error < quantiles_kmodes[2], "Normal", "Underpaid"))

df_Q7$status_kmodes <- ifelse(df_Q7$kmodes_ratio_error < quantiles_lm[1], "Overpaid", 
                      ifelse(df_Q7$kmodes_ratio_error < quantiles_lm[2], "Normal", "Underpaid"))
ggplot(df_Q7, aes(x = x, y = salary_in_usd)) +
        geom_point(aes(y = salary_in_usd), size= 2, shape = 20) +
        geom_point(aes(y = predicted_salary_lm, color = status_lm), size =2,  shape = 20, alpha = 0.9) + 
        scale_color_manual(
          values = c("Overpaid" = "red", "Underpaid" = "orange", "Normal" = "green"), 
          labels = c('Normal', 'Overpaid', 'Underpaid'),
          aesthetics = c("colour", "fill"), 
          name = "Consideration") +
        ggtitle("Plot of salary and predicted salaries using Linear Regression") +
        ylab("Salary in USD") +
        scale_y_continuous(labels = label_comma(), breaks = scales::pretty_breaks(n = 10))

ggplot(df_Q7, aes(x = x, y = salary_in_usd)) +
        geom_point(aes(y = salary_in_usd), size= 2, shape = 20) +
        geom_point(aes(y = predicted_salary_kmodes, color = status_kmodes), size =2,  shape = 20, alpha = 0.9) + 
        scale_color_manual(
          values = c("Overpaid" = "red", "Underpaid" = "orange", "Normal" = "green"), 
          labels = c('Normal', 'Overpaid', 'Underpaid'),
          aesthetics = c("colour", "fill"),
          name = "Consideration") +
        ggtitle("Plot of salary and predicted salaries using Clustering") +
        ylab("Salary in USD") +
        labs(fill="Consideration") +
        scale_y_continuous(labels = label_comma(), breaks = scales::pretty_breaks(n = 10))

df_Q7$status_lm <- as.factor(df_Q7$status_lm)
df_Q7$status_kmodes <- as.factor(df_Q7$status_kmodes)


ct <- table(df_Q7$status_lm,df_Q7$status_kmodes)
ct <- ct[c(3,1,2),c(3,1,2)]
ct <- melt(ct)
## Warning in type.convert.default(X[[i]], ...): 'as.is' should be specified by the caller; using TRUE

## Warning in type.convert.default(X[[i]], ...): 'as.is' should be specified by the caller; using TRUE
colnames(ct) <- c("LinearRegression", "Clustering", "Count")

ggplot(ct, aes(x = LinearRegression, y = Clustering, fill = Count)) +
  geom_tile() +
  scale_fill_gradient2(low = "#075AFF", mid = "#FFFFCC", high = "#FF0000") +
  geom_text(aes(label = Count), color = "black", size = 4) +
  coord_fixed() +
  ggtitle("Comparison between Clustering and Linear Regression") 

underpaid_lm <- df_Q7[df_Q7$status_lm=="Underpaid",]
overpaid_lm <- df_Q7[df_Q7$status_lm=="Overpaid",]

underpaid_kmodes <- df_Q7[df_Q7$status_kmodes =="Underpaid",]
overpaid_kmodes  <- df_Q7[df_Q7$status_kmodes =="Overpaid",]
variables <- c("experience_level", "remote_ratio", "salary_group", "employee_region", "company_region", "role", "boss", "research", "company_size", "industry")

for (i in 1:length(variables)){
  ratio_df <- table(df_Q7[, c(variables[i])])/nrow(df_Q7)
  ratio_up_lm <- table(underpaid_lm[, c(variables[i])])/nrow(underpaid_lm)
  ratio_op_lm <- table(overpaid_lm[, c(variables[i])])/nrow(overpaid_lm)

  # plot them in such a way that they can be compared
  df_long_lm <- melt(data.frame(ratio_up_lm, ratio_df, ratio_op_lm))
  
  ratio_up_kmodes <- table(underpaid_kmodes[, c(variables[i])])/nrow(underpaid_kmodes)
  ratio_op_kmodes <- table(overpaid_kmodes[, c(variables[i])])/nrow(overpaid_kmodes)

  # plot them in such a way that they can be compared
  df_long_kmodes <- melt(data.frame(ratio_up_kmodes, ratio_df, ratio_op_kmodes))

  print(ggplot(data = df_long_lm, aes(x = variable, y = value , fill = Var1, label = round(value, 2))) +
    geom_bar(stat = "identity", position = "fill") +
    geom_text(size = 3, position = position_stack(vjust = 0.5)) +
    scale_x_discrete(labels = c("Freq"= "Underpaid", "Freq.1"= "Normal", "Freq.2" = "Overpaid")) +
    labs(fill=variables[i], title=paste("Stacked percent bar chart of ",variables[i], " (LM)"), x="Consideration", y = "%"))
  
   print(ggplot(data = df_long_kmodes, aes(x = variable, y = value , fill = Var1, label = round(value, 2))) +
    geom_bar(stat = "identity", position = "fill") +
    geom_text(size = 3, position = position_stack(vjust = 0.5)) +
    scale_x_discrete(labels = c("Freq"= "Underpaid", "Freq.1"= "Normal", "Freq.2" = "Overpaid")) +
    labs(fill=variables[i], title=paste("Stacked percent bar chart of ",variables[i], " (Clustering)"), x="Consideration", y = "%"))
}
## Using Var1, Var1.1, Var1.2 as id variables
## Using Var1, Var1.1, Var1.2 as id variables

## Using Var1, Var1.1, Var1.2 as id variables
## Using Var1, Var1.1, Var1.2 as id variables

## Using Var1, Var1.1, Var1.2 as id variables
## Using Var1, Var1.1, Var1.2 as id variables

## Using Var1, Var1.1, Var1.2 as id variables
## Using Var1, Var1.1, Var1.2 as id variables

## Using Var1, Var1.1, Var1.2 as id variables
## Using Var1, Var1.1, Var1.2 as id variables

## Using Var1, Var1.1, Var1.2 as id variables
## Using Var1, Var1.1, Var1.2 as id variables

## Using Var1, Var1.1, Var1.2 as id variables
## Using Var1, Var1.1, Var1.2 as id variables

## Using Var1, Var1.1, Var1.2 as id variables
## Using Var1, Var1.1, Var1.2 as id variables

## Using Var1, Var1.1, Var1.2 as id variables
## Using Var1, Var1.1, Var1.2 as id variables

## Using Var1, Var1.1, Var1.2 as id variables
## Using Var1, Var1.1, Var1.2 as id variables

3. Employee Seniority: Upward mobility and career growth

Can we predict an employee’s experience level based on their salary and job title? Using classification algorithms, we could build a model that predicts an employee’s experience level based on their salary and job title. This could be useful for employees and job seekers who want to understand what experience level they can expect to be paid for a given salary and job title. We could also include other factors, such as location and company size, to improve the accuracy of the model.

For this we’ll use a Decision Tree since we are interested in having a clear way to visualize and understand the result. We want to classify the experience level of employees based on attributes such as their role, industry for which they work, salary, etcetera.

BASE MODEL: Predict experience level with only salary and job title; since job title is a factor with numerous levels industry and role will be used instead.

df14 = df[, c("industry", "role", "salary_in_usd", "experience_level")]
dt14 = rpart(experience_level~., data=df14)
rpart.plot(dt14)

ADVANCED MODEL: Predict experience level with all of the variables available (apart from those which factor level exceeds 32).

df14v2 = df[ , -which(names(df) %in% c("job_title", "work_year", "ID", "company_location", "employee_residence", "employee_region", "company_region", "salary_group", "salary_currency", "remote_ratio"))]
summary(df14v2)
##  experience_level employment_type salary_in_usd    company_size  industry       boss                role      research      
##  EN:147           CT:   8         Min.   :  2324   L:322        DATA :1076   Mode :logical   ANALYST  :232   Mode :logical  
##  EX: 45           FL:   5         1st Qu.: 75593   M:898        ML/AI: 157   FALSE:1276      ARCHITECT: 40   FALSE:1306     
##  MI:337           FT:1306         Median :120000   S:112        OTHER:  99   TRUE :56        DEVELOPER: 16   TRUE :26       
##  SE:803           PT:  13         Mean   :123375                                             ENGINEER :527                  
##                                   3rd Qu.:164997                                             MANAGER  : 50                  
##                                   Max.   :600000                                             OTHER    : 41                  
##                                                                                              SCIENTIST:426
dt14v2 = rpart(experience_level~., data=df14v2)
rpart.plot(dt14v2)

3.1. How accurately can we predict experience_level within the company using a k-nearest neighbors classification model, while excluding the salary attribute from the dataset?

set.seed(123)
df_no_s <- df
df_no_s$salary_group <- NULL
df_no_s$salary_currency <- NULL
df_no_s$salary_in_usd <- NULL
train_index <- createDataPartition(df_no_s$experience_level, p = 0.7, list = FALSE)
train_data <- df_no_s[train_index, ]
test_data <- df_no_s[-train_index, ]
model <- train(experience_level ~ ., data = train_data, method = "knn", tuneGrid = expand.grid(k = 1:20))
print(model$bestTune$k)
## [1] 1
summary(model)
##             Length Class      Mode     
## learn         2    -none-     list     
## k             1    -none-     numeric  
## theDots       0    -none-     list     
## xNames      214    -none-     character
## problemType   1    -none-     character
## tuneValue     1    data.frame list     
## obsLevels     4    -none-     character
## param         0    -none-     list
importance <- varImp(model)
plot(importance)

predictions <- predict(model, test_data)
cMatrix <- confusionMatrix(predictions, test_data$experience_level)
cMatrix
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  EN  EX  MI  SE
##         EN  11   0  11   5
##         EX   0   3   1   2
##         MI  16   3  51  15
##         SE  17   7  38 218
## 
## Overall Statistics
##                                           
##                Accuracy : 0.7111          
##                  95% CI : (0.6638, 0.7551)
##     No Information Rate : 0.603           
##     P-Value [Acc > NIR] : 4.532e-06       
##                                           
##                   Kappa : 0.4374          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: EN Class: EX Class: MI Class: SE
## Sensitivity            0.25000  0.230769    0.5050    0.9083
## Specificity            0.95480  0.992208    0.8855    0.6076
## Pos Pred Value         0.40741  0.500000    0.6000    0.7786
## Neg Pred Value         0.91105  0.974490    0.8403    0.8136
## Prevalence             0.11055  0.032663    0.2538    0.6030
## Detection Rate         0.02764  0.007538    0.1281    0.5477
## Detection Prevalence   0.06784  0.015075    0.2136    0.7035
## Balanced Accuracy      0.60240  0.611489    0.6952    0.7580
cMatrix$byClass[1]
## [1] 0.25

Failed 1. How much should the salary increase be when you get promoted to a higher Experience Level? (according to similar colleagues with the same job area).

This could be useful for employees that are going to recieve a promotion, and want to know how much they should get paid for their new position. It could also be useful for employers that want to know how much they should offer new hires or promoting employees.

f <- ggplot(df, aes(x=reorder(experience_level,salary_in_usd), y=salary_in_usd))+
     geom_boxplot()+
     ggtitle("Salary Distribution for each experience level") + ylab("Salary (USD)") + xlab("")
f

As we can see, from ENtry level to EXecutive level, as the experience increases, the larger the average salary gets. Lets see if we can find, using clustering, 3 zones for each

custom_line_plot <- function(datos,avgs, title){
  plot1 <- ggplot()+
            geom_violin(data=datos, aes(x=reorder(experience_level,salary_in_usd), y = salary_in_usd))+ 
  geom_line(data=avgs,aes(x=Experience_Level, y=Average_Salary, group=1))+
  geom_point(data=avgs,aes(x=Experience_Level, y=Average_Salary, group=1))+
  geom_text(data=avgs,aes(x=Experience_Level, y=Average_Salary, label = round(Average_Salary,0)) ,hjust=-0.2, vjust=-1)+
  scale_y_continuous(breaks = scales::pretty_breaks(n = 15))+
  labs(title=title)
  plot1
}

custom_line_plot(df_data,data_avg,"Data Industry")

custom_line_plot(df_ai,ai_avg,"AI/ML Industry")

custom_line_plot(df_other, other_avg,"Other Industries")

get_increases <- function(dframe){
  dframe$increase <- 0
dframe$increase[dframe$Experience_Level=="MI"] <- dframe$Average_Salary[dframe$Experience_Level=="MI"] - dframe$Average_Salary[dframe$Experience_Level=="EN"]
dframe$increase[dframe$Experience_Level=="SE"] <- dframe$Average_Salary[dframe$Experience_Level=="SE"] - dframe$Average_Salary[dframe$Experience_Level=="MI"]
dframe$increase[dframe$Experience_Level=="EX"] <- dframe$Average_Salary[dframe$Experience_Level=="EX"] - dframe$Average_Salary[dframe$Experience_Level=="SE"]

dframe$increasePercentage <- 0
dframe$increasePercentage[dframe$Experience_Level=="MI"] <- dframe$Average_Salary[dframe$Experience_Level=="MI"] / dframe$Average_Salary[dframe$Experience_Level=="EN"]
dframe$increasePercentage[dframe$Experience_Level=="SE"] <- dframe$Average_Salary[dframe$Experience_Level=="SE"] / dframe$Average_Salary[dframe$Experience_Level=="MI"]
dframe$increasePercentage[dframe$Experience_Level=="EX"] <- dframe$Average_Salary[dframe$Experience_Level=="EX"] / dframe$Average_Salary[dframe$Experience_Level=="SE"]

dframe$increasePercentage <- ifelse(dframe$Experience_Level=="EN",0,(dframe$increasePercentage - 1) * 100)

return(dframe)
}

data_avg <- get_increases(data_avg)
ai_avg <- get_increases(ai_avg)
other_avg <- get_increases(other_avg)

data_avg
##   Experience_Level Average_Salary increase increasePercentage
## 1               EN       57413.78     0.00            0.00000
## 2               MI       86994.15 29580.37           51.52138
## 3               SE      144961.30 57967.15           66.63339
## 4               EX      205371.24 60409.94           41.67315
ai_avg
##   Experience_Level Average_Salary increase increasePercentage
## 1               EN        65501.3     0.00           0.000000
## 2               MI       108864.9 43363.55          66.202585
## 3               SE       151329.1 42464.23          39.006374
## 4               EX       164241.8 12912.66           8.532837
other_avg
##   Experience_Level Average_Salary increase increasePercentage
## 1               EN       68638.79     0.00            0.00000
## 2               MI       94903.37 26264.58           38.26493
## 3               SE      133025.39 38122.02           40.16930
## 4               EX      158747.00 25721.61           19.33586

Failed 2. What is more worth, to work in the office at your region or remotely in another region?

First we should have a look at how much are employees paid while working in the office at their same region.

df_filtered <- filter(df, employee_region == company_region)

print(ggplot(aes(x = employee_region, y = salary_in_usd, fill=employee_region), data = df_filtered) +
    stat_summary(fun=mean, geom="bar", position = "stack") +
    stat_summary(aes(label=round(after_stat(y),2)), fun.y=mean, geom="text", size=3, vjust = -0.5) + 
    xlab("Region") + labs(title="Salary of any work in the same Region"))
## Warning: The `fun.y` argument of `stat_summary()` is deprecated as of ggplot2 3.3.0.
## ℹ Please use the `fun` argument instead.

Here we can see that most of the Regions are underpaid for office work compared to North America, especially the rest of america.

Could they earn more if they work remotely for another Region?

Lets have a look at how much do companies in each region pay for Remote work.

df_filtered <- filter(df, employee_region != company_region & remote_ratio == "Remote")

print(ggplot(aes(x = company_region, y = salary_in_usd, fill=company_region), data = df_filtered) +
    stat_summary(fun=mean, geom="bar", position = "stack") +
    stat_summary(aes(label=round(after_stat(y),2)), fun.y=mean, geom="text", size=3, vjust = -0.5) + 
    xlab("Region") + labs(title="Salary of Remote work in another Region"))

We can see that by average North America and Europe are close, while the rest are much lower, especially Asia. The regions that do not appear, don’t employ remote workers from another regions.

In relation to the high paying, we can see that USA pays less for remote work, than for office work. On the other hand in Europe the salary is on the same level for remote and office work.

Now we can focus on answering the question.

aux <- data.frame(matrix(0, nrow = length(levels(df$company_region)), ncol = length(levels(df$company_region))))
colnames(aux) <- levels(df$company_region)
rownames(aux) <- levels(df$company_region)

for (i in 1:nrow(aux)) {
  max = 0
  for (j in 1:ncol(aux)) {
    df_filtered <- filter(df, company_region == colnames(aux)[j] & employee_region == rownames(aux)[i])
    aux[i, j] <- mean(df_filtered$salary_in_usd)
  }
}

aux_clean <- data.frame(matrix(0, nrow = length(levels(df$company_region)), ncol = length(levels(df$company_region))))
colnames(aux_clean) <- levels(df$company_region)
rownames(aux_clean) <- levels(df$company_region)

aux <- replace(aux, is.na(aux), 0)

for (i in 1:nrow(aux_clean)) {
  max <- 0
  for (j in 1:ncol(aux_clean)) {
    #if(i == j) {
     # aux_clean[i,j] <- aux[i,j]
    #} else 
    if(aux[i,j]>max){
      max <- aux[i,j]
      aux_clean[i,j] <- aux[i,j]
    }
  }
}


data_long <- aux_clean %>%
  rownames_to_column %>%
  gather(key = 'key', value = 'value', -rowname) %>%
  filter(value > 0)
colnames(data_long) <- c("source", "target", "value")
data_long$target <- paste(data_long$target, " ", sep="")

nodes <- data.frame(name=c(as.character(data_long$source), as.character(data_long$target)) %>% unique())
 
data_long$IDsource=match(data_long$source, nodes$name)-1 
data_long$IDtarget=match(data_long$target, nodes$name)-1

sankeyNetwork(Links = data_long, Nodes = nodes,
                     Source = "IDsource", Target = "IDtarget",
                     Value = "value", NodeID = "name", 
                     sinksRight=FALSE, nodeWidth=40, fontSize=13, nodePadding=20)

Failed 3. Can we identify salary disparities based on residence and other demographic factors?

For this we’ll use visualization techniques with the ggplot library. The width of the boxes depends on the amount of samples each class/value has so we can extract better conclusions out of them.

ggplot(df, aes(x = as.factor(work_year), y = salary_in_usd)) +
         geom_boxplot(varwidth=T, fill="lightblue") +
         labs(title = "Salary depending on the work year", x = "Year", y = "Salary in USD") +
         theme_minimal() + 
         scale_y_continuous(labels = scales::dollar_format())

ggplot(df, aes(x = experience_level, y = salary_in_usd)) +
         geom_boxplot(varwidth=T, fill="lightblue") +
         labs(title = "Salary depending on the experience level", x = "Experience level", y = "Salary in USD") +
         theme_minimal() + 
         scale_y_continuous(labels = scales::dollar_format())

ggplot(df, aes(x = employment_type, y = salary_in_usd)) +
         geom_boxplot(varwidth=T, fill="lightblue") +
         labs(title = "Salary depending on the employment type", x = "Employment type", y = "Salary in USD") +
         theme_minimal() + 
         scale_y_continuous(labels = scales::dollar_format())

ggplot(df, aes(x = employee_region, y = salary_in_usd)) +
         geom_boxplot(varwidth=T, fill="lightblue") +
         labs(title = "Salary depending on the employee region of residence", x = "Region of residence", y = "Salary in USD") +
         theme_minimal() + 
         scale_y_continuous(labels = scales::dollar_format())

ggplot(df, aes(x = company_size, y = salary_in_usd)) +
         geom_boxplot(varwidth=T, fill="lightblue") +
         labs(title = "Salary depending on the company size", x = "Company size", y = "Salary in USD") +
         theme_minimal() + 
         scale_y_continuous(labels = scales::dollar_format())

ggplot(df, aes(x = role, y = salary_in_usd)) +
         geom_boxplot(varwidth=T, fill="lightblue") +
         labs(title = "Salary depending on the role", x = "Role", y = "Salary in USD") +
         theme_minimal() + 
         scale_y_continuous(labels = scales::dollar_format())

ggplot(df, aes(x = industry, y = salary_in_usd)) +
         geom_boxplot(varwidth=T, fill="lightblue") +
         labs(title = "Salary depending on the industry", x = "Industry", y = "Salary in USD") +
         theme_minimal() + 
         scale_y_continuous(labels = scales::dollar_format())

ggplot(df, aes(x = boss, y = salary_in_usd)) +
         geom_boxplot(varwidth=T, fill="lightblue") +
         labs(title = "Salary depending on whether the employee has a leadership position", x = "Has a leadership position", y = "Salary in USD") +
         theme_minimal() + 
         scale_y_continuous(labels = scales::dollar_format())

ggplot(df, aes(x = research, y = salary_in_usd)) +
         geom_boxplot(varwidth=T, fill="lightblue") +
         labs(title = "Salary depending on whether the employee participates in research projects", x = "Works in research projects", y = "Salary in USD") +
         theme_minimal() + 
         scale_y_continuous(labels = scales::dollar_format())

Let’s go through the different plots and check if we can identify any disparity:

  • Work year: we can observe how the salaries seem to be growing year by year - mostly in 2022. Probably due to the inflation rates. Further we can check which is the amount of growth in salaries discounting the inflation.
  • Experience level: as expected, more experienced professionals earn a higher salary.
  • Employment type: we can’t say much about contract, freelance or part time jobs since the amount of samples we have is quite reduced; but probably contracts generate higher revenue than a full time job.
  • Employee region of residence: we can observe how the income in north american countries is higher than in the other regions, can’t say much about other regions apart of NA and Europe since the dataset is mostly composed of these regions data.
  • Company size: we can clearly see that smaller companies usually offer lower salaries while medium and large companies have higher wages. The difference between medium and large ones is mostly the same but the impressive salaries usually come from large enterprises (makes sense think about Google, Meta, Netflix, Amazon…).
  • Role: better paid roles are architects, engineers, managers and scientists while developers have the lower wages from all the roles.
  • Industry: in terms of the company industry we can’t say much about it since data and ML/AI companies are quite similar when it comes to wages.
  • Leadership position: we can clearly observe how having a leadership position in the company usually implies a higher salary in return.
  • Involved in research projects: on the other hand, participating in research projects seems to be not so interesting, at least from the financial perspective, since most other people earn higher salaries than the ones that participate in this kind of projects.
increase = (mean(df[df$work_year == 2022, 'salary_in_usd'])/mean(df[df$work_year == 2021, 'salary_in_usd'])) - 1
net_increase = increase - 0.077 # official inflation rate in 2022 in the US
net_increase
## [1] 0.2717521

It seems like the increase in salaries between 2021 and 2022 is not explained only with the inflation, since the increase has been of over a 34% in total and the official 2022 inflation rate that the US government gives is ay 7.7%. The net salary increase taking that into account (most samples of the data are US based), net salaries have increased a 27.17%.

Failed 4. Join cost of living index dataset into problem dataset

We only want the country and the cost of living and we delete Kosovo from the list since it is a disputed territory and luckily enough we don’t have any Kosovo related data in our dataset

cost_of_living_df = read.csv("Cost_of_Living_Index_2022.csv", stringsAsFactors = TRUE)[-67, c(2,3)]
cost_of_living_df = setNames(cost_of_living_df, c("country", "cost_of_living_index"))
cost_of_living_df$country = countrycode(cost_of_living_df$country, origin = 'country.name', destination = 'iso2c')

Add the cost of living index columns of the employee residence and the company location deleting those rows that contain either American Samoa or Aaland Islands, since we don’t have information about the cost of living index in these countries.

dfN<-subset(df, employee_residence!="AX" & employee_residence!="AS" & company_location!="AX" & company_location!="AS")
dfN$cost_of_living_index_employee = rep(0, nrow(dfN))
dfN$cost_of_living_index_company = rep(0, nrow(dfN))
for(i in 1:nrow(dfN)) {
  country_employee = dfN[i, ]$employee_residence
  country_company = dfN[i, ]$company_location
  index_employee =  cost_of_living_df[as.character(cost_of_living_df$country)==as.character(country_employee), ]$cost_of_living_index
  index_company = cost_of_living_df[as.character(cost_of_living_df$country)==as.character(country_company), ]$cost_of_living_index
  dfN[i,]$cost_of_living_index_employee = index_employee
  dfN[i,]$cost_of_living_index_company = index_company
}

Let’s find the employees that live in a cheaper country than the one that they work for.

subset(dfN, cost_of_living_index_employee>cost_of_living_index_company)
##      work_year experience_level employment_type                          job_title salary_currency salary_in_usd employee_residence remote_ratio company_location company_size salary_group employee_region company_region industry  boss      role research cost_of_living_index_employee cost_of_living_index_company
## 439       2022               EN              FT                     Data Scientist             USD         50000                 US       Hybrid               DE            M          50k   North America         Europe     DATA FALSE SCIENTIST    FALSE                         70.13                        65.58
## 440       2022               EN              FT                       Data Analyst             INR          6413                 FR       Remote               IN            L          25k          Europe           Asia     DATA FALSE   ANALYST    FALSE                         74.13                        24.43
## 643       2022               SE              FT Applied Machine Learning Scientist             EUR         77280                 FR       Remote               GB            L         100k          Europe         Europe    ML/AI FALSE SCIENTIST    FALSE                         74.13                        69.65
## 745       2022               MI              FT          Machine Learning Engineer             EUR         84229                 FR       Remote               DE            M         100k          Europe         Europe    ML/AI FALSE  ENGINEER    FALSE                         74.13                        65.58
## 1015      2022               EN              FT                     Data Scientist             USD         40000                 JP       Remote               MY            L          50k            Asia           Asia     DATA FALSE SCIENTIST    FALSE                         77.03                        37.02
## 1051      2022               MI              FL                     Data Scientist             USD        100000                 CA       Remote               US            M         100k   North America  North America     DATA FALSE SCIENTIST    FALSE                         70.22                        70.13
## 1073      2021               MI              FT                  Big Data Engineer             USD         60000                 ES       Hybrid               RO            M          75k          Europe         Europe     DATA FALSE  ENGINEER    FALSE                         53.88                        35.24
## 1113      2021               EX              FT           Director of Data Science             EUR        153667                 IT       Remote               PL            L         200k          Europe         Europe     DATA  TRUE     OTHER    FALSE                         66.47                        38.95
## 1194      2021               EN              FT                 Research Scientist             USD        100000                 JE       Office               CN            L         100k          Europe           Asia    OTHER FALSE SCIENTIST     TRUE                         92.02                        41.77
## 1202      2021               EN              FT          Machine Learning Engineer             USD         85000                 NL       Remote               DE            S         100k          Europe         Europe    ML/AI FALSE  ENGINEER    FALSE                         75.66                        65.58
## 1233      2021               EX              FT            Data Science Consultant             EUR         69741                 FR       Remote               ES            S          75k          Europe         Europe     DATA FALSE     OTHER    FALSE                         74.13                        53.88
## 1268      2021               MI              FT                      Data Engineer             GBP         66022                 HK       Hybrid               GB            S          75k            Asia         Europe     DATA FALSE  ENGINEER    FALSE                         80.71                        69.65
## 1277      2021               SE              FT                 Research Scientist             USD         50000                 FR       Remote               US            S          50k          Europe  North America    OTHER FALSE SCIENTIST     TRUE                         74.13                        70.13
subset(dfN, as.character(company_location)!=as.character(employee_residence))
##      work_year experience_level employment_type                          job_title salary_currency salary_in_usd employee_residence remote_ratio company_location company_size salary_group employee_region company_region industry  boss      role research cost_of_living_index_employee cost_of_living_index_company
## 68        2022               EN              PT                       Data Analyst             USD        125404                 CN       Hybrid               US            S         150k            Asia  North America     DATA FALSE   ANALYST    FALSE                         41.77                        70.13
## 157       2022               SE              FT             Product Data Scientist             USD          8000                 IN       Remote               SG            L          25k            Asia           Asia     DATA FALSE SCIENTIST    FALSE                         24.43                        83.98
## 271       2022               EN              FT                       Data Analyst             USD         20000                 CR       Hybrid               US            M          25k Central America  North America     DATA FALSE   ANALYST    FALSE                         47.01                        70.13
## 315       2022               EN              FT                    Data Specialist             USD        105000                 CL       Remote               US            L         125k   South America  North America     DATA FALSE     OTHER    FALSE                         43.90                        70.13
## 357       2022               EN              FT                       AI Scientist             EUR         31586                 PT       Remote               ES            M          50k          Europe         Europe    ML/AI FALSE SCIENTIST    FALSE                         47.94                        53.88
## 373       2022               MI              CT                       NLP Engineer             USD         60000                 IN       Remote               US            S          75k            Asia  North America    OTHER FALSE  ENGINEER    FALSE                         24.43                        70.13
## 403       2022               SE              FT           Data Engineering Manager             USD        193000                 ES       Remote               US            M         200k          Europe  North America     DATA FALSE   MANAGER    FALSE                         53.88                        70.13
## 439       2022               EN              FT                     Data Scientist             USD         50000                 US       Hybrid               DE            M          50k   North America         Europe     DATA FALSE SCIENTIST    FALSE                         70.13                        65.58
## 440       2022               EN              FT                       Data Analyst             INR          6413                 FR       Remote               IN            L          25k          Europe           Asia     DATA FALSE   ANALYST    FALSE                         74.13                        24.43
## 500       2022               MI              FT Applied Machine Learning Scientist             USD         75000                 BO       Remote               US            M          75k   South America  North America    ML/AI FALSE SCIENTIST    FALSE                         34.77                        70.13
## 538       2022               EN              PT                     Data Scientist             USD        110000                 DO       Remote               FR            M         125k Central America         Europe     DATA FALSE SCIENTIST    FALSE                         41.77                        74.13
## 631       2022               MI              FT                     Data Scientist             INR         53877                 IN       Remote               ID            L          75k            Asia           Asia     DATA FALSE SCIENTIST    FALSE                         24.43                        35.85
## 636       2022               SE              FT                 Analytics Engineer             USD         48000                 AR       Remote               US            S          50k   South America  North America    OTHER FALSE  ENGINEER    FALSE                         34.69                        70.13
## 643       2022               SE              FT Applied Machine Learning Scientist             EUR         77280                 FR       Remote               GB            L         100k          Europe         Europe    ML/AI FALSE SCIENTIST    FALSE                         74.13                        69.65
## 726       2022               MI              FT                       AI Scientist             USD        200000                 IN       Remote               US            L         200k            Asia  North America    ML/AI FALSE SCIENTIST    FALSE                         24.43                        70.13
## 745       2022               MI              FT          Machine Learning Engineer             EUR         84229                 FR       Remote               DE            M         100k          Europe         Europe    ML/AI FALSE  ENGINEER    FALSE                         74.13                        65.58
## 796       2022               SE              FT              Data Science Engineer             USD         60000                 AR       Remote               MX            L          75k   South America  North America     DATA FALSE  ENGINEER    FALSE                         34.69                        35.35
## 999       2022               MI              FT Applied Machine Learning Scientist             USD         75000                 BO       Remote               US            L          75k   South America  North America    ML/AI FALSE SCIENTIST    FALSE                         34.77                        70.13
## 1006      2022               EN              FT           Computer Vision Engineer             USD         10000                 PT       Remote               LU            M          25k          Europe         Europe    OTHER FALSE  ENGINEER    FALSE                         47.94                        80.50
## 1009      2022               SE              FT                       Head of Data             USD        200000                 MY       Remote               US            M         200k            Asia  North America     DATA  TRUE     OTHER    FALSE                         37.02                        70.13
## 1015      2022               EN              FT                     Data Scientist             USD         40000                 JP       Remote               MY            L          50k            Asia           Asia     DATA FALSE SCIENTIST    FALSE                         77.03                        37.02
## 1018      2020               EN              FT                      Data Engineer             EUR         54742                 PK       Remote               DE            L          75k            Asia         Europe     DATA FALSE  ENGINEER    FALSE                         19.92                        65.58
## 1023      2022               SE              FT                     Data Scientist             USD        100000                 BR       Remote               US            M         100k   South America  North America     DATA FALSE SCIENTIST    FALSE                         33.24                        70.13
## 1028      2022               EN              CT Applied Machine Learning Scientist             EUR         30533                 TN       Remote               CZ            M          50k          Africa         Europe    ML/AI FALSE SCIENTIST    FALSE                         27.87                        48.24
## 1030      2022               EN              FT                      Data Engineer             EUR         55591                 PK       Remote               DE            M          75k            Asia         Europe     DATA FALSE  ENGINEER    FALSE                         19.92                        65.58
## 1044      2022               MI              FT                     Data Scientist             USD         48000                 RU       Remote               US            S          50k          Europe  North America     DATA FALSE SCIENTIST    FALSE                         35.26                        70.13
## 1051      2022               MI              FL                     Data Scientist             USD        100000                 CA       Remote               US            M         100k   North America  North America     DATA FALSE SCIENTIST    FALSE                         70.22                        70.13
## 1057      2021               MI              FT Applied Machine Learning Scientist             USD         38400                 VN       Remote               US            M          50k            Asia  North America    ML/AI FALSE SCIENTIST    FALSE                         37.48                        70.13
## 1073      2021               MI              FT                  Big Data Engineer             USD         60000                 ES       Hybrid               RO            M          75k          Europe         Europe     DATA FALSE  ENGINEER    FALSE                         53.88                        35.24
## 1089      2020               SE              FT                     Data Scientist             EUR         68428                 GR       Remote               US            L          75k          Europe  North America     DATA FALSE SCIENTIST    FALSE                         56.22                        70.13
## 1108      2021               EN              PT                       AI Scientist             USD         12000                 PK       Remote               US            M          25k            Asia  North America    ML/AI FALSE SCIENTIST    FALSE                         19.92                        70.13
## 1111      2021               MI              FT             Applied Data Scientist             CAD         54238                 GB       Hybrid               CA            L          75k          Europe  North America     DATA FALSE SCIENTIST    FALSE                         69.65                        70.22
## 1113      2021               EX              FT           Director of Data Science             EUR        153667                 IT       Remote               PL            L         200k          Europe         Europe     DATA  TRUE     OTHER    FALSE                         66.47                        38.95
## 1119      2021               SE              FT                       Data Analyst             USD         80000                 BG       Remote               US            S         100k          Europe  North America     DATA FALSE   ANALYST    FALSE                         38.38                        70.13
## 1126      2020               MI              FT                     Data Scientist             USD         45760                 PH       Remote               US            S          50k            Asia  North America     DATA FALSE SCIENTIST    FALSE                         37.06                        70.13
## 1130      2021               EX              FT                    BI Data Analyst             USD        150000                 IN       Remote               US            L         150k            Asia  North America     DATA FALSE   ANALYST    FALSE                         24.43                        70.13
## 1135      2021               SE              FT             Marketing Data Analyst             EUR         88654                 GR       Remote               DK            L         100k          Europe         Europe     DATA FALSE   ANALYST    FALSE                         56.22                        84.12
## 1146      2021               MI              FT                    BI Data Analyst             HUF         36259                 HU       Hybrid               US            L          50k          Europe  North America     DATA FALSE   ANALYST    FALSE                         40.66                        70.13
## 1149      2021               MI              FT  Computer Vision Software Engineer             EUR         95746                 DE       Remote               US            S         100k          Europe  North America    OTHER FALSE  ENGINEER    FALSE                         65.58                        70.13
## 1157      2021               SE              FT                Cloud Data Engineer             USD        160000                 BR       Remote               US            S         200k   South America  North America     DATA FALSE  ENGINEER    FALSE                         33.24                        70.13
## 1194      2021               EN              FT                 Research Scientist             USD        100000                 JE       Office               CN            L         100k          Europe           Asia    OTHER FALSE SCIENTIST     TRUE                         92.02                        41.77
## 1202      2021               EN              FT          Machine Learning Engineer             USD         85000                 NL       Remote               DE            S         100k          Europe         Europe    ML/AI FALSE  ENGINEER    FALSE                         75.66                        65.58
## 1210      2021               SE              FT                 Lead Data Engineer             USD        160000                 PR       Hybrid               US            S         200k Central America  North America     DATA  TRUE  ENGINEER    FALSE                         66.53                        70.13
## 1211      2021               MI              FT                     Data Scientist             EUR         25532                 RS       Remote               DE            S          50k          Europe         Europe     DATA FALSE SCIENTIST    FALSE                         36.21                        65.58
## 1222      2021               SE              FT               Data Science Manager             INR         54094                 IN       Hybrid               US            L          75k            Asia  North America     DATA FALSE   MANAGER    FALSE                         24.43                        70.13
## 1233      2021               EX              FT            Data Science Consultant             EUR         69741                 FR       Remote               ES            S          75k          Europe         Europe     DATA FALSE     OTHER    FALSE                         74.13                        53.88
## 1234      2021               SE              FT            Data Analytics Engineer             USD         50000                 VN       Remote               GB            M          50k            Asia         Europe     DATA FALSE  ENGINEER    FALSE                         37.48                        69.65
## 1238      2021               MI              FT                      Data Engineer             EUR         26005                 RO       Office               US            L          50k          Europe  North America     DATA FALSE  ENGINEER    FALSE                         35.24                        70.13
## 1241      2021               MI              FT                     Data Scientist             INR          5679                 IN       Remote               US            S          25k            Asia  North America     DATA FALSE SCIENTIST    FALSE                         24.43                        70.13
## 1246      2021               MI              FT                     Data Scientist             EUR         61467                 DE       Hybrid               AT            M          75k          Europe         Europe     DATA FALSE SCIENTIST    FALSE                         65.58                        71.04
## 1251      2021               SE              FT         Machine Learning Scientist             USD        225000                 US       Remote               CA            L         250k   North America  North America    ML/AI FALSE SCIENTIST    FALSE                         70.13                        70.22
## 1258      2020               SE              FT                  Big Data Engineer             EUR        114047                 PL       Remote               GB            S         125k          Europe         Europe     DATA FALSE  ENGINEER    FALSE                         38.95                        69.65
## 1261      2020               MI              FT                 Lead Data Engineer             USD         56000                 PT       Remote               US            M          75k          Europe  North America     DATA  TRUE  ENGINEER    FALSE                         47.94                        70.13
## 1268      2021               MI              FT                      Data Engineer             GBP         66022                 HK       Hybrid               GB            S          75k            Asia         Europe     DATA FALSE  ENGINEER    FALSE                         80.71                        69.65
## 1272      2021               MI              FL                      Data Engineer             USD         20000                 IT       Office               US            L          25k          Europe  North America     DATA FALSE  ENGINEER    FALSE                         66.47                        70.13
## 1277      2021               SE              FT                 Research Scientist             USD         50000                 FR       Remote               US            S          50k          Europe  North America    OTHER FALSE SCIENTIST     TRUE                         74.13                        70.13
## 1282      2021               EN              FT                  Big Data Engineer             INR          5882                 IN       Office               CH            L          25k            Asia         Europe     DATA FALSE  ENGINEER    FALSE                         24.43                       123.35
## 1294      2021               SE              FT                      Data Engineer             EUR         76833                 RO       Hybrid               GB            S         100k          Europe         Europe     DATA FALSE  ENGINEER    FALSE                         35.24                        69.65
## 1296      2021               SE              FT               Data Science Manager             USD        152000                 US       Remote               FR            L         200k   North America         Europe     DATA FALSE   MANAGER    FALSE                         70.13                        74.13
## 1298      2020               SE              FL           Computer Vision Engineer             USD         60000                 RU       Remote               US            S          75k          Europe  North America    OTHER FALSE  ENGINEER    FALSE                         35.26                        70.13
## 1309      2020               MI              FT                     Data Scientist             EUR         62726                 FR       Hybrid               LU            S          75k          Europe         Europe     DATA FALSE SCIENTIST    FALSE                         74.13                        80.50
## 1313      2021               EN              PT                       AI Scientist             USD         12000                 BR       Remote               US            S          25k   South America  North America    ML/AI FALSE SCIENTIST    FALSE                         33.24                        70.13
## 1321      2020               MI              FT                      Data Engineer             USD        130800                 ES       Remote               US            M         150k          Europe  North America     DATA FALSE  ENGINEER    FALSE                         53.88                        70.13
## 1324      2021               MI              FT                     Data Scientist             SGD        119059                 SG       Remote               IL            M         125k            Asia           Asia     DATA FALSE SCIENTIST    FALSE                         83.98                        88.05